import warnings
warnings.filterwarnings('ignore')
import time
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import random
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import np_utils
from xgboost import XGBClassifier
import string
from tqdm import tqdm
tqdm.pandas()
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('words')
import collections
import matplotlib.cm as cm
from matplotlib import rcParams
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from collections import defaultdict
import matplotlib.ticker as ticker
import matplotlib as mpl
from matplotlib.gridspec import GridSpec
from nltk import sent_tokenize
#from albumentations.core.transforms_interface import DualTransform, BasicTransform
import openpyxl
from wordcloud import WordCloud, STOPWORDS
from sklearn.decomposition import PCA
ticket = pd.read_excel('C:\\Users\\Create\\Downloads\\Great_Learning\\Final_Project\\input_data.xlsx', index_col=0)
ticket.reset_index(inplace = True)
ticket
| | Short description | Description | Caller | Assignment group |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager na... | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komu... | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqk... | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
| ... | ... | ... | ... | ... |
| 8495 | emails not coming in from zz mail | _x000D_\n_x000D_\nreceived from: avglmrts.vhqm... | avglmrts vhqmtiua | GRP_29 |
| 8496 | telephony_software issue | telephony_software issue | rbozivdq gmlhrtvp | GRP_0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | vip2: windows password reset for tifpdchb pedx... | oybwdsgx oxyhwrfz | GRP_0 |
| 8498 | machine não está funcionando | i am unable to access the machine utilities to... | ufawcgob aowhxjky | GRP_62 |
| 8499 | an mehreren pc`s lassen sich verschiedene prgr... | an mehreren pc`s lassen sich verschiedene prgr... | kqvbrspl jyzoklfx | GRP_49 |
8500 rows × 4 columns
df_processing = ticket.copy()
df_processing['text'] = df_processing['Short description'] + " " + df_processing["Description"]
df_processing = df_processing[['text','Assignment group']]
df_processing['Assignment group']= df_processing['Assignment group'].str.replace('GRP','')
df_processing['Assignment group']= df_processing['Assignment group'].str.replace('_','')
df_processing['Assignment group']= df_processing['Assignment group'].astype(int)
df_processing['text']= df_processing['text'].astype(str)
df_processing['Assignment group'].value_counts()
0 3976
8 661
24 289
12 257
9 252
...
61 1
35 1
73 1
64 1
67 1
Name: Assignment group, Length: 74, dtype: int64
top_20 = df_processing['Assignment group'].value_counts().nlargest(20).reset_index()
plt.subplots(figsize=(20,5))
sns.barplot(x='index', y='Assignment group', data=top_20)
plt.xlabel('Assignment Group')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Assignment Group Distribution')
plt.ylim(0, 5000)
plt.show();
plt.figure(1,figsize=(16,15))
df_processing['Assignment group'].value_counts().sort_values().plot(kind = 'barh')
Group 0 alone holds about 46% of the records.
Only a few other groups hold more than 1% of the records each.
The remaining groups each hold less than 1% of the records.
value_count_df = df_processing['Assignment group'].value_counts().to_frame().reset_index()
value_count_df['Per'] = df_processing['Assignment group'].value_counts(normalize=True).values
GrpToBeMaintained = value_count_df[value_count_df['Assignment group'] >= 80]['index'].values
value_count_df[value_count_df['Assignment group'] >= 80]
| | index | Assignment group | Per |
|---|---|---|---|
| 0 | 0 | 3976 | 0.467765 |
| 1 | 8 | 661 | 0.077765 |
| 2 | 24 | 289 | 0.034000 |
| 3 | 12 | 257 | 0.030235 |
| 4 | 9 | 252 | 0.029647 |
| 5 | 2 | 241 | 0.028353 |
| 6 | 19 | 215 | 0.025294 |
| 7 | 3 | 200 | 0.023529 |
| 8 | 6 | 184 | 0.021647 |
| 9 | 13 | 145 | 0.017059 |
| 10 | 10 | 140 | 0.016471 |
| 11 | 5 | 129 | 0.015176 |
| 12 | 14 | 118 | 0.013882 |
| 13 | 25 | 116 | 0.013647 |
| 14 | 33 | 107 | 0.012588 |
| 15 | 4 | 100 | 0.011765 |
| 16 | 29 | 97 | 0.011412 |
| 17 | 18 | 88 | 0.010353 |
| 18 | 16 | 85 | 0.010000 |
| 19 | 17 | 81 | 0.009529 |
GrpToBeMaintained
array([ 0, 8, 24, 12, 9, 2, 19, 3, 6, 13, 10, 5, 14, 25, 33, 4, 29,
18, 16, 17], dtype=int64)
title_rating = df_processing.groupby('Assignment group').agg('count')
rating_labels = title_rating.text.sort_values().index
rating_counts = title_rating.text.sort_values()
plt.figure(1, figsize=(40,70))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplot(the_grid[0, 1], aspect=1, title='Percentage of Each Group')
type_show_ids = plt.pie(rating_counts, labels=rating_labels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
def assignGroup(x):
if x in GrpToBeMaintained:
val = x
else:
val = 100
return val
df_sampled = df_processing[ df_processing['Assignment group'].isin(GrpToBeMaintained)]
df_sampled.isnull().sum()
text                0
Assignment group    0
dtype: int64
df_sampled
| | text | Assignment group |
|---|---|---|
| 0 | login issue -verified user details.(employee# ... | 0 |
| 1 | outlook _x000D_\n_x000D_\nreceived from: hmjdr... | 0 |
| 2 | cant log in to vpn _x000D_\n_x000D_\nreceived ... | 0 |
| 3 | unable to access hr_tool page unable to access... | 0 |
| 4 | skype error skype error | 0 |
| ... | ... | ... |
| 8493 | erp fi - ob09, two accounts to be added i am ... | 10 |
| 8494 | tablet needs reimaged due to multiple issues w... | 3 |
| 8495 | emails not coming in from zz mail _x000D_\n_x0... | 29 |
| 8496 | telephony_software issue telephony_software issue | 0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | 0 |
7481 rows × 2 columns
def wl(text):
return len(text.split(" "))
fig_df = pd.DataFrame()
fig_df['word_length']=df_sampled['text'].apply(wl)
fig_df['char_length']=df_sampled['text'].apply(len)
fig_df['text']= df_sampled['text']
fig_df['Assignment group']= df_sampled['Assignment group']
fig_df[["text","word_length"]].sort_values(by = "word_length",ascending = False).head(10)
| | text | word_length |
|---|---|---|
| 7345 | security incidents - ( sw #in33501789 ) : broa... | 1431 |
| 4089 | security incidents - ( sw #in33895560 ) : mage... | 1408 |
| 7989 | security incidents - ( dsw #in33407676 ) : tra... | 1364 |
| 7997 | security incidents - ( sw #in33544563 ) : poss... | 1303 |
| 3965 | security incidents - ( #in33809307 ) : possibl... | 1176 |
| 7984 | security incidents - ( dsw #in33390850 ) : sus... | 1027 |
| 7982 | security incidents - ( dsw #in33390850 ) : sus... | 1027 |
| 5092 | security incidents - ( #in33578632) : suspicio... | 958 |
| 5433 | security incidents - ( #in33765965 ) : possibl... | 934 |
| 7647 | security incidents - ( #in33578632) : suspicio... | 897 |
fig_df[["text","word_length"]].sort_values(by = "word_length",ascending = True).head(10)
| | text | word_length |
|---|---|---|
| 3383 | nan | 1 |
| 4395 | nan | 1 |
| 3906 | nan | 1 |
| 3910 | nan | 1 |
| 3915 | nan | 1 |
| 3921 | nan | 1 |
| 3924 | nan | 1 |
| 4341 | nan | 1 |
| 3432 | dds dss | 2 |
| 1860 | s s | 2 |
fig_df[["text","char_length"]].sort_values(by = "char_length",ascending = False).head()
| | text | char_length |
|---|---|---|
| 7345 | security incidents - ( sw #in33501789 ) : broa... | 14544 |
| 4089 | security incidents - ( sw #in33895560 ) : mage... | 12032 |
| 7984 | security incidents - ( dsw #in33390850 ) : sus... | 11372 |
| 7989 | security incidents - ( dsw #in33407676 ) : tra... | 10450 |
| 5092 | security incidents - ( #in33578632) : suspicio... | 10212 |
fig_df[["text","char_length"]].sort_values(by = "char_length",ascending = True).head()
| | text | char_length |
|---|---|---|
| 3924 | nan | 3 |
| 3921 | nan | 3 |
| 3383 | nan | 3 |
| 4341 | nan | 3 |
| 3915 | nan | 3 |
df_nan = fig_df.loc[fig_df['word_length'] == 1]
#df.loc[df['column_name'] == some_value]
#df_processing[df_processing['text'].isna()]
df_nan
| | word_length | char_length | text | Assignment group |
|---|---|---|---|---|
| 3383 | 1 | 3 | nan | 0 |
| 3906 | 1 | 3 | nan | 0 |
| 3910 | 1 | 3 | nan | 0 |
| 3915 | 1 | 3 | nan | 0 |
| 3921 | 1 | 3 | nan | 0 |
| 3924 | 1 | 3 | nan | 0 |
| 4341 | 1 | 3 | nan | 0 |
| 4395 | 1 | 3 | nan | 0 |
df_sampled = df_sampled.drop(df_nan.index)
df_sampled
| | text | Assignment group |
|---|---|---|
| 0 | login issue -verified user details.(employee# ... | 0 |
| 1 | outlook _x000D_\n_x000D_\nreceived from: hmjdr... | 0 |
| 2 | cant log in to vpn _x000D_\n_x000D_\nreceived ... | 0 |
| 3 | unable to access hr_tool page unable to access... | 0 |
| 4 | skype error skype error | 0 |
| ... | ... | ... |
| 8493 | erp fi - ob09, two accounts to be added i am ... | 10 |
| 8494 | tablet needs reimaged due to multiple issues w... | 3 |
| 8495 | emails not coming in from zz mail _x000D_\n_x0... | 29 |
| 8496 | telephony_software issue telephony_software issue | 0 |
| 8497 | vip2: windows password reset for tifpdchb pedx... | 0 |
7473 rows × 2 columns
fig_df['word_length'].hist()
fig_df['char_length'].hist()
fig_df.groupby('Assignment group').agg({'word_length': ['mean', 'min', 'max']})
word_length statistics per group:
| Assignment group | mean | min | max |
|---|---|---|---|
| 0 | 23.421278 | 1 | 478 |
| 2 | 106.012448 | 8 | 1431 |
| 3 | 41.390000 | 6 | 958 |
| 4 | 54.640000 | 6 | 304 |
| 5 | 37.984496 | 8 | 412 |
| 6 | 27.798913 | 9 | 268 |
| 8 | 75.417549 | 5 | 379 |
| 9 | 29.337302 | 8 | 449 |
| 10 | 34.621429 | 8 | 158 |
| 12 | 47.105058 | 4 | 1027 |
| 13 | 53.903448 | 4 | 252 |
| 14 | 44.381356 | 10 | 348 |
| 16 | 43.164706 | 9 | 192 |
| 17 | 13.358025 | 10 | 68 |
| 18 | 59.704545 | 10 | 361 |
| 19 | 32.004651 | 4 | 618 |
| 24 | 14.650519 | 8 | 121 |
| 25 | 37.612069 | 6 | 215 |
| 29 | 45.072165 | 7 | 190 |
| 33 | 32.121495 | 4 | 264 |
fig_df.groupby('Assignment group').agg({'char_length': ['mean', 'min', 'max']})
char_length statistics per group:
| Assignment group | mean | min | max |
|---|---|---|---|
| 0 | 176.196932 | 3 | 3463 |
| 2 | 879.921162 | 37 | 14544 |
| 3 | 293.905000 | 39 | 10212 |
| 4 | 362.280000 | 37 | 1691 |
| 5 | 273.480620 | 89 | 1784 |
| 6 | 231.989130 | 55 | 1431 |
| 8 | 462.183056 | 42 | 1906 |
| 9 | 233.777778 | 53 | 1999 |
| 10 | 280.885714 | 55 | 1059 |
| 12 | 380.424125 | 29 | 11372 |
| 13 | 375.827586 | 31 | 1607 |
| 14 | 316.254237 | 47 | 2075 |
| 16 | 336.400000 | 67 | 1525 |
| 17 | 111.839506 | 82 | 436 |
| 18 | 424.284091 | 60 | 2323 |
| 19 | 248.944186 | 25 | 7631 |
| 24 | 114.525952 | 45 | 1113 |
| 25 | 312.448276 | 49 | 1911 |
| 29 | 334.865979 | 67 | 1680 |
| 33 | 237.383178 | 29 | 1458 |
sns.scatterplot(x='char_length',y='word_length',data=fig_df)
stopwords = set(STOPWORDS)
def show_wordcloud(data, title = None):
wordcloud = WordCloud(
background_color='black',
stopwords=stopwords,
max_words=200,
max_font_size=40,
scale=3,
random_state=1 # chosen at random by flipping a coin; it was heads
).generate(str(data))
fig = plt.figure(1, figsize=(15, 15))
plt.axis('off')
if title:
fig.suptitle(title, fontsize=20)
fig.subplots_adjust(top=2.3)
plt.imshow(wordcloud)
plt.show()
#show_wordcloud(df['reviews.text'])
show_wordcloud(df_processing['text'])
STOPWORDS = set(STOPWORDS)
words = set(nltk.corpus.words.words())
PUNCT_TO_REMOVE = string.punctuation
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
- Convert to lower case
- Remove URLs from the input
- Remove HTML tags from the input
- Remove emojis from the input
- Remove special characters from the input
- Remove punctuation from the input
- Remove meaningless words
- Remove stopwords
- Lemmatisation
# Functions for preprocessing
def remove_upper_case(text):
    '''
    Function to convert fully upper-case words to title case.
    @param text: (str) text
    @return: (str) text with upper-case words title-cased
    '''
    sentences = text.split("\n")
    new_sentences = []
    for sentence in sentences:
        words_in_sentence = sentence.split()
        stripped = [w.title() if w.isupper() else w for w in words_in_sentence]
        new_sentences.append(" ".join(stripped))
    return "\n".join(new_sentences)
def remove_URL( text):
'''
Function to remove url from text.
@param text: (str) sentence
@return: (str) clean text
'''
url = re.compile(r'https?://\S+|www\.\S+')
return url.sub(r'',text)
def remove_html( text):
'''
Function regex to clean text from html balises.
@param text: (str) sentence
@return: (str) clean text
'''
html=re.compile(r'<.*?>')
return html.sub(r'',text)
def remove_emoji( text):
'''
Function to remove emojis, symbols and pictograms etc from text
@param text: (str) sentences
@return: (str) clean text
'''
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
return emoji_pattern.sub(r'', text)
def remove_special_char(text):
    '''
    Function to remove special characters from text
    @param text: (str) sentences
    @return: (str) clean text
    '''
    spcl_char_pattern = re.compile(r'[`~!@#$%^&*()_+={}\[\]|\\:;“’<,>.?๐฿]+')
    return spcl_char_pattern.sub(r'', text)
def clean_sent_org(sent):
    # keep dictionary words plus non-alphabetic tokens (numbers, ids)
    return " ".join(w for w in nltk.wordpunct_tokenize(sent)
                    if w.lower() in words or not w.isalpha())
def clean_sent(sent):
    # keep only tokens that appear in the NLTK English word list
    return " ".join(w for w in nltk.wordpunct_tokenize(sent)
                    if w.lower() in words)
def remove_punctuation(text):
"""custom function to remove the punctuation"""
return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
def remove_stopwords(text):
"""custom function to remove the stopwords"""
return " ".join([word for word in str(text).split() if word not in STOPWORDS])
def lemmatize_words(text):
pos_tagged_text = nltk.pos_tag(text.split())
return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
#df["text_lemmatized"] = df["text"].apply(lambda text: lemmatize_words(text))
TEXT = 'text'
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_upper_case)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_URL)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_html)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_emoji)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_special_char)
df_sampled[TEXT] = df_sampled[TEXT].apply(clean_sent)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_punctuation)
df_sampled[TEXT] = df_sampled[TEXT].apply(remove_stopwords)
df_sampled[TEXT] = df_sampled[TEXT].apply(lemmatize_words)
df_sampled[TEXT] = df_sampled[TEXT].str.strip().replace('', 'NaN')
df_sampled[df_sampled[TEXT]=='NaN']
| | text | Assignment group |
|---|---|---|
| 4 | NaN | 0 |
| 5 | NaN | 0 |
| 8 | NaN | 0 |
| 10 | NaN | 0 |
| 13 | NaN | 0 |
| ... | ... | ... |
| 8473 | NaN | 0 |
| 8474 | NaN | 0 |
| 8485 | NaN | 0 |
| 8489 | NaN | 0 |
| 8494 | NaN | 3 |
1545 rows × 2 columns
df_sampled = df_sampled[df_sampled[TEXT] != 'NaN']
df_sampled
| | text | Assignment group |
|---|---|---|
| 0 | login issue user employee manager name check u... | 0 |
| 1 | outlook receive hello team outlook calendar so... | 0 |
| 2 | cant log receive log best cant log receive log... | 0 |
| 3 | unable access page unable access | 0 |
| 7 | employment status new non employee enter user ... | 0 |
| ... | ... | ... |
| 8492 | option | 0 |
| 8493 | fi two add sorry another two need add please c... | 10 |
| 8495 | come mail receive good afternoon send mail ple... | 29 |
| 8496 | issue | 0 |
| 8497 | password reset | 0 |
5936 rows × 2 columns
top_20_grp = df_sampled['Assignment group'].value_counts().nlargest(20).index
for i in top_20_grp.values:
print("Word cloud for class ",i)
show_wordcloud(df_sampled[df_sampled['Assignment group'] == i]['text'])
Word cloud for class 0
Word cloud for class 8
Word cloud for class 12
Word cloud for class 9
Word cloud for class 2
Word cloud for class 24
Word cloud for class 6
Word cloud for class 3
Word cloud for class 19
Word cloud for class 13
Word cloud for class 10
Word cloud for class 5
Word cloud for class 14
Word cloud for class 25
Word cloud for class 4
Word cloud for class 29
Word cloud for class 18
Word cloud for class 17
Word cloud for class 16
Word cloud for class 33
def plt_freq_words(all_lines):
filtered_words = all_lines.split()
counted_words = collections.Counter(filtered_words)
words = []
counts = []
for letter, count in counted_words.most_common(20):
words.append(letter)
counts.append(count)
colors = cm.rainbow(np.linspace(0, 1, 20))
rcParams['figure.figsize'] = 20, 10
plt.title('Top words in the tickets vs their count')
plt.xlabel('Count')
plt.ylabel('Words')
plt.barh(words, counts, color=colors)
plt_freq_words(' '.join(df_sampled['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 0]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 8]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 24]['text'].str.lower()))
plt_freq_words(' '.join(df_sampled[df_sampled['Assignment group'] == 12]['text'].str.lower()))
def generate_N_grams(text,ngram):
words=[word for word in text.split(" ") if word not in STOPWORDS]
#print("Sentence after removing stopwords:",words)
temp=zip(*[words[i:] for i in range(0,ngram)])
ans=[' '.join(ngram) for ngram in temp]
return ans
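A quick usage example (assuming none of these words is in STOPWORDS):
print(generate_N_grams("password reset request", 2))
# -> ['password reset', 'reset request']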
Grp0Values=defaultdict(int)
Grp8Values=defaultdict(int)
Grp24Values=defaultdict(int)
Grp12Values=defaultdict(int)
# count every unigram in the tickets of each assignment group
for text in df_sampled[df_sampled['Assignment group']==0].text:
for word in generate_N_grams(text,1):
Grp0Values[word]+=1
for text in df_sampled[df_sampled['Assignment group']==8].text:
for word in generate_N_grams(text,1):
Grp8Values[word]+=1
for text in df_sampled[df_sampled['Assignment group']==24].text:
for word in generate_N_grams(text,1):
Grp24Values[word]+=1
for text in df_sampled[df_sampled['Assignment group']==12].text:
for word in generate_N_grams(text,1):
Grp12Values[word]+=1
df_0Values=pd.DataFrame(sorted(Grp0Values.items(),key=lambda x:x[1],reverse=True))
df_8Values=pd.DataFrame(sorted(Grp8Values.items(),key=lambda x:x[1],reverse=True))
df_24Values=pd.DataFrame(sorted(Grp24Values.items(),key=lambda x:x[1],reverse=True))
df_12Values=pd.DataFrame(sorted(Grp12Values.items(),key=lambda x:x[1],reverse=True))
plt.figure(1,figsize=(16,4))
plt.bar(df_0Values[0][:10], df_0Values[1][:10], color='green', width=0.4)
plt.xlabel("Words")
plt.ylabel("Count")
plt.title("Top 10 words in Group 0")
plt.savefig("unigram_0.png")
plt.show()
Grp0Values_2=defaultdict(int)
Grp8Values_2=defaultdict(int)
Grp24Values_2=defaultdict(int)
Grp12Values_2=defaultdict(int)
# count every bigram in the tickets of each assignment group
for text in df_sampled[df_sampled['Assignment group']==0].text:
for word in generate_N_grams(text,2):
Grp0Values_2[word]+=1
for text in df_sampled[df_sampled['Assignment group']==8].text:
for word in generate_N_grams(text,2):
Grp8Values_2[word]+=1
for text in df_sampled[df_sampled['Assignment group']==24].text:
for word in generate_N_grams(text,2):
Grp24Values_2[word]+=1
for text in df_sampled[df_sampled['Assignment group']==12].text:
for word in generate_N_grams(text,2):
Grp12Values_2[word]+=1
df_0Values_2=pd.DataFrame(sorted(Grp0Values_2.items(),key=lambda x:x[1],reverse=True))
df_8Values_2=pd.DataFrame(sorted(Grp8Values_2.items(),key=lambda x:x[1],reverse=True))
df_24Values_2=pd.DataFrame(sorted(Grp24Values_2.items(),key=lambda x:x[1],reverse=True))
df_12Values_2=pd.DataFrame(sorted(Grp12Values_2.items(),key=lambda x:x[1],reverse=True))
plt.figure(1,figsize=(16,4))
plt.bar(df_8Values_2[0][:10], df_8Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 8")
plt.savefig("bigram8.png")
plt.show()
plt.figure(1,figsize=(16,4))
plt.bar(df_24Values_2[0][:10], df_24Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 24")
plt.savefig("bigram_24.png")
plt.show()
plt.figure(1,figsize=(16,4))
plt.bar(df_12Values_2[0][:10], df_12Values_2[1][:10], color='green', width=0.4)
plt.xlabel("Bigrams")
plt.ylabel("Count")
plt.title("Top 10 bigrams in Group 12")
plt.savefig("bigram_12.png")
plt.show()
Grp0Values_3=defaultdict(int)
Grp8Values_3=defaultdict(int)
Grp24Values_3=defaultdict(int)
Grp12Values_3=defaultdict(int)
# count every trigram in the tickets of each assignment group
for text in df_sampled[df_sampled['Assignment group']==0].text:
for word in generate_N_grams(text,3):
Grp0Values_3[word]+=1
for text in df_sampled[df_sampled['Assignment group']==8].text:
for word in generate_N_grams(text,3):
Grp8Values_3[word]+=1
for text in df_sampled[df_sampled['Assignment group']==24].text:
for word in generate_N_grams(text,3):
Grp24Values_3[word]+=1
for text in df_sampled[df_sampled['Assignment group']==12].text:
for word in generate_N_grams(text,3):
Grp12Values_3[word]+=1
df_0Values_3=pd.DataFrame(sorted(Grp0Values_3.items(),key=lambda x:x[1],reverse=True))
df_8Values_3=pd.DataFrame(sorted(Grp8Values_3.items(),key=lambda x:x[1],reverse=True))
df_24Values_3=pd.DataFrame(sorted(Grp24Values_3.items(),key=lambda x:x[1],reverse=True))
df_12Values_3=pd.DataFrame(sorted(Grp12Values_3.items(),key=lambda x:x[1],reverse=True))
plt.figure(1,figsize=(16,4))
plt.bar(df_8Values_3[0][:10], df_8Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 8")
plt.savefig("trigram_8.png")
plt.show()
plt.figure(1,figsize=(16,4))
plt.bar(df_24Values_3[0][:10], df_24Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 24")
plt.savefig("trigram_24.png")
plt.show()
plt.figure(1,figsize=(16,4))
plt.bar(df_12Values_3[0][:10], df_12Values_3[1][:10], color='green', width=0.4)
plt.xticks(rotation=90)
plt.xlabel("Trigrams")
plt.ylabel("Count")
plt.title("Top 10 trigrams in Group 12")
plt.savefig("Trigram_12.png")
plt.show()
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
#aug = nac.KeyboardAug()
aug = naw.SynonymAug()
aug.augment("tomorrow is working day")
from random import shuffle
def augment_text(df,samples,Grp_No):
##selecting the minority class samples
df_n=df[df['Assignment group']==Grp_No].reset_index(drop=True)
## data augmentation loop
for i in tqdm(np.random.randint(0,len(df_n),samples)):
text = df_n.iloc[i]['text']
new_text = aug.augment(text)
new=pd.DataFrame({'text':[new_text],'Assignment group':[Grp_No]})
df_n=(df_n.append(new).reset_index(drop=True))
## dataframe
return df_n
df_grp_8 = augment_text(df_sampled,200,8)
df_grp_12 = augment_text(df_sampled,400,12)
df_grp_9 = augment_text(df_sampled,400,9)
df_grp_2 = augment_text(df_sampled,400,2)
df_grp_24 = augment_text(df_sampled,400,24)
df_grp_6 = augment_text(df_sampled,400,6)
df_grp_3 = augment_text(df_sampled,500,3)
df_grp_19 = augment_text(df_sampled,500,19)
df_grp_13 = augment_text(df_sampled,500,13)
df_grp_10 = augment_text(df_sampled,500,10)
df_grp_5 = augment_text(df_sampled,500,5)
df_grp_14 = augment_text(df_sampled,500,14)
df_grp_25 = augment_text(df_sampled,500,25)
df_grp_4 = augment_text(df_sampled,500,4)
df_grp_29 = augment_text(df_sampled,500,29)
df_grp_18 = augment_text(df_sampled,500,18)
df_grp_17 = augment_text(df_sampled,500,17)
df_grp_16 = augment_text(df_sampled,500,16)
df_grp_33 = augment_text(df_sampled,500,33)
df_grp_0 = df_sampled[df_sampled['Assignment group']==0].sample(1000)
df_model = pd.concat([df_grp_8, df_grp_12, df_grp_9,df_grp_2,df_grp_24,df_grp_6,df_grp_3 ,df_grp_19,df_grp_13,df_grp_10,df_grp_5,df_grp_14,df_grp_25,df_grp_4,df_grp_29,df_grp_18,df_grp_17,df_grp_16,df_grp_33,df_grp_0 ])
title_rating = df_model.groupby('Assignment group').agg('count')
rating_labels = title_rating.text.sort_values().index
rating_counts = title_rating.text.sort_values()
plt.figure(1, figsize=(40,70))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplot(the_grid[0, 1], aspect=1, title='Percentage of Each Group')
type_show_ids = plt.pie(rating_counts, labels=rating_labels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
df_model
LABEL = 'Assignment group'
from sklearn.utils import class_weight
# splits on the augmented data (used by the *_count / *_seq features below)
train_x, rem_x, train_y, rem_y = model_selection.train_test_split(df_model[TEXT], df_model[LABEL], random_state=42, stratify=df_model[LABEL], train_size=0.8)
valid_x, test_x, valid_y, test_y = model_selection.train_test_split(rem_x, rem_y, test_size=0.5, random_state=42)
train_x_org, rem_x_org, train_y_org, rem_y_org = model_selection.train_test_split(df_sampled[TEXT], df_sampled[LABEL], random_state=42, stratify=df_sampled[LABEL], train_size=0.8)
valid_x_org, test_x_org, valid_y_org, test_y_org = model_selection.train_test_split(rem_x_org, rem_y_org, random_state=42, train_size=0.5)
df_results = pd.DataFrame()
def apply_pca(X):
    '''Plot the cumulative explained variance derived from the covariance matrix of X.'''
    cov_matrix_1 = np.cov(X.T)
    eigen_value_1, eigen_vector_1 = np.linalg.eig(cov_matrix_1)
    # cumulative explained variance (in %) from the eigenvalues, largest first
    tol_1 = sum(eigen_value_1)
    var_eigen_value = [(i / tol_1) * 100 for i in sorted(eigen_value_1, reverse=True)]
    cum_eigen_val = np.cumsum(var_eigen_value)
    plt.plot(cum_eigen_val)
    return eigen_value_1, var_eigen_value, cum_eigen_val
Count vectorization counts the number of times each word appears in a document (i.e., a distinct text such as an article, a book, or even a paragraph). Python's scikit-learn library provides CountVectorizer to accomplish this.
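A minimal illustration on a toy corpus (the three example sentences are invented for demonstration):
toy_corpus = ["password reset request", "vpn login issue", "password login issue"]
toy_cv = CountVectorizer()
toy_matrix = toy_cv.fit_transform(toy_corpus)
print(sorted(toy_cv.vocabulary_))  # ['issue', 'login', 'password', 'request', 'reset', 'vpn']
print(toy_matrix.toarray())        # each row counts the words of one sentence:
# [[0 0 1 1 1 0]
#  [1 1 0 0 0 1]
#  [1 1 1 0 0 0]]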
def apply_count_vect(xtrain,xvalid,xtest):
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',max_features = 300)
count_vect.fit(xtrain)
# transform the training and validation data using count vectorizer object
xtraincount = count_vect.transform(xtrain).toarray()
xvalidcount = count_vect.transform(xvalid).toarray()
xtestcount = count_vect.transform(xtest).toarray()
return xtraincount,xvalidcount,xtestcount
TfidfVectorizer transforms text into feature vectors that can be used as input to an estimator. Its vocabulary_ attribute is a dictionary that maps each token (word) to a feature index in the matrix; each unique token gets one feature index.
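For instance, on the same toy corpus as above, vocabulary_ maps each token to its column index:
toy_tfidf = TfidfVectorizer()
toy_tfidf.fit(["password reset request", "vpn login issue", "password login issue"])
print(toy_tfidf.vocabulary_)
# {'password': 2, 'reset': 4, 'request': 3, 'vpn': 5, 'login': 1, 'issue': 0}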
def apply_tf_idf_vect(xtrain, xvalid, xtest):
    tfidf_vect = TfidfVectorizer(ngram_range=(1, 3), max_features=300)
    tfidf_vect.fit(xtrain)
    # transform the training, validation and test data using the fitted vectorizer
    tfidf_vect_train = tfidf_vect.transform(xtrain)
    tfidf_vect_val = tfidf_vect.transform(xvalid)
    tfidf_vect_test = tfidf_vect.transform(xtest)
    return tfidf_vect_train.toarray(), tfidf_vect_val.toarray(), tfidf_vect_test.toarray()
def fit_pca(n_component, x_train, x_test, x_val):
    pca_1 = PCA(n_components=n_component, random_state=1)
    # fit on the training data only, then project all three splits
    x_train_pca = pca_1.fit_transform(x_train)
    x_test_pca = pca_1.transform(x_test)
    x_valid_pca = pca_1.transform(x_val)
    return x_train_pca, x_test_pca, x_valid_pca
def fit_tokenizer(xtrain,xvalid,xtest):
token = Tokenizer()
token.fit_on_texts(xtrain)
word_index = token.word_index
# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq = sequence.pad_sequences(token.texts_to_sequences(xtrain), maxlen=300)
valid_seq = sequence.pad_sequences(token.texts_to_sequences(xvalid), maxlen=300)
test_seq = sequence.pad_sequences(token.texts_to_sequences(xtest), maxlen=300)
return train_seq,valid_seq,test_seq
%%time
xtrain_count,xvalid_count,xtest_count = apply_count_vect(train_x,valid_x,test_x)
xtrain_count_org,xvalid_count_org,xtest_count_org = apply_count_vect(train_x_org,valid_x_org,test_x_org)
tfidf_vect_train_count,tfidf_vect_val_count,tfidf_vect_test_count = apply_tf_idf_vect(train_x_org,valid_x_org,test_x_org)
# token sequences for the augmented split (used by the *_seq models below)
train_seq,valid_seq,test_seq = fit_tokenizer(train_x,valid_x,test_x)
Sampling - SMOTE, over-sampling & under-sampling
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
oversample = SMOTE(sampling_strategy='all')
roversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
rundersampler = RandomUnderSampler(sampling_strategy='majority')
tfidf_vect_train_count_smote, train_y_org_smote = oversample.fit_resample(tfidf_vect_train_count, train_y_org)
tfidf_vect_train_count_rovsamp, train_y_org_rovsamp = roversampler.fit_resample(tfidf_vect_train_count, train_y_org)
tfidf_vect_train_count_rundrsamp, train_y_org_rundrsamp = rundersampler.fit_resample(tfidf_vect_train_count, train_y_org)
print(tfidf_vect_train_count_smote.shape, train_y_org_smote.shape)
print(tfidf_vect_train_count_rovsamp.shape, train_y_org_rovsamp.shape)
print(tfidf_vect_train_count_rundrsamp.shape, train_y_org_rundrsamp.shape)
(43780, 300) (43780,)
(6877, 300) (6877,)
(2619, 300) (2619,)
xtrain_count_org_smote, train_y_count_org_smote = oversample.fit_resample(xtrain_count_org, train_y_org)
xtrain_count_org_rovsamp, train_y_count_org_rovsamp = roversampler.fit_resample(xtrain_count_org, train_y_org)
xtrain_count_org_rundrsamp, train_y_count_org_rundrsamp = rundersampler.fit_resample(xtrain_count_org, train_y_org)
print(xtrain_count_org_smote.shape, train_y_count_org_smote.shape)
print(xtrain_count_org_rovsamp.shape, train_y_count_org_rovsamp.shape)
print(xtrain_count_org_rundrsamp.shape, train_y_count_org_rundrsamp.shape)
(43780, 300) (43780,)
(6877, 300) (6877,)
(2619, 300) (2619,)
Tokenizing text means representing each word by a number; the mapping from original word to number is preserved in the tokenizer's word_index property. The Tokenizer also applies basic processing such as lower-casing, unless that is explicitly disabled. We keep every ticket at 300 tokens, padding shorter tickets and truncating longer ones.
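A tiny illustration of this (the two sentences are invented for demonstration):
demo_token = Tokenizer()
demo_token.fit_on_texts(["password reset", "password issue"])
print(demo_token.word_index)  # {'password': 1, 'reset': 2, 'issue': 3}
print(sequence.pad_sequences(demo_token.texts_to_sequences(["password issue"]), maxlen=5))
# [[0 0 0 1 3]]  (zero-padded on the left to length 5)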
num_words = 300 + 1
embedding_size = 50
# create a tokenizer
token = Tokenizer()
token.fit_on_texts(train_x_org)
word_index = token.word_index
# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x_org), maxlen=300)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x_org), maxlen=300)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x_org), maxlen=300)
import pickle
# Dict of metrics to use in the model selection
score_metrics = {'accuracy': accuracy_score,
#'balanced_accuracy': balanced_accuracy_score,
'precision_score': precision_score,
'recall_score': recall_score,
'f1-score': f1_score,
#'tp': tp, 'tn': tn,
#'fp': fp, 'fn': fn,
#'cohens_kappa':cohen_kappa_score,
#'matthews_corrcoef':matthews_corrcoef,
#"roc_auc":roc_auc_score
}
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
def report(clf, x, y, name='classifier', cv=5, dict_scoring=None, fit_params=None):
    # weighted scorers for the imbalanced multi-class problem
    score = {'accuracy': make_scorer(accuracy_score),
             'f1-score': make_scorer(sklearn.metrics.f1_score, average='weighted'),
             'precision_score': make_scorer(sklearn.metrics.precision_score, average='weighted'),
             'recall_score': make_scorer(sklearn.metrics.recall_score, average='weighted')
             }
    scores = model_selection.cross_validate(clf, x, y.values, scoring=score, cv=cv, fit_params=fit_params, return_train_score=True, verbose=0)
    # refit on the full training data and pickle the fitted model
    fitted_model = clf.fit(x, y)
    pickle.dump(fitted_model, open(name + "_model", 'wb'))
index = []
value = []
index.append("Model")
value.append(name)
for i in scores:
if i == "estimator":
continue
for j in enumerate(scores[i]):
index.append(i+"_cv"+str(j[0]+1))
value.append(j[1])
index.append(i+"_mean")
value.append(np.mean(scores[i]))
index.append(i+"_std")
value.append(np.std(scores[i]))
#print(i,np.mean(scores[i]))
return pd.DataFrame(data=value, index=index).T
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count,train_y, name='NB_Count_Vectors', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), tfidf_vect_train_count,train_y_org, name='NB_TF-IDF', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), train_seq,train_y, name='NB_seq', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count_org,train_y_org, name='NB_Count_Vectors_org', cv=5, dict_scoring=score_metrics))
#df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count,train_y, name='NB_Count_Vectors', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), train_seq_x,train_y_org, name='NB_Seq_Data', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count_org_smote,train_y_count_org_smote, name='NB_Count_Vectors_org_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), tfidf_vect_train_count_smote,train_y_org_smote, name='NB_TF-IDF_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count_org_rovsamp,train_y_count_org_rovsamp, name='NB_Count_Vectors_org_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), tfidf_vect_train_count_rovsamp,train_y_org_rovsamp, name='NB_TF-IDF_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count_org_rundrsamp,train_y_count_org_rundrsamp, name='NB_Count_Vectors_org_rundrsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(naive_bayes.MultinomialNB(), tfidf_vect_train_count_rundrsamp,train_y_org_rundrsamp, name='NB_TF-IDF_rundrsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count,train_y, name='LR_Count_Vector', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), tfidf_vect_train_count,train_y_org, name='LR_TF-IDF', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), train_seq,train_y, name='LR_seq', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count_org,train_y_org, name='LR_count_vector_org', cv=5, dict_scoring=score_metrics))
#df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count,train_y, name='LR_Count_Vector', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), train_seq_x,train_y_org, name='LR_Seq_Data', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), tfidf_vect_train_count_smote,train_y_org_smote, name='LR_TF-IDF_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count_org_smote,train_y_count_org_smote, name='LR_count_vector_org_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), tfidf_vect_train_count_rovsamp,train_y_org_rovsamp, name='LR_TF-IDF_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count_org_rovsamp,train_y_count_org_rovsamp, name='LR_count_vector_org_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), tfidf_vect_train_count_rundrsamp,train_y_org_rundrsamp, name='LR_TF-IDF_rundrsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count_org_rundrsamp,train_y_count_org_rundrsamp, name='LR_count_vector_org_rndrsamp', cv=5, dict_scoring=score_metrics))
# note: 'eval_metric' is the XGBoost fit parameter ('metric' belongs to LightGBM)
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(xvalid_count, valid_y)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), xtrain_count,train_y, name='XGB_Count_Vectors', cv=3, fit_params=fit_params, dict_scoring=score_metrics ))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(tfidf_vect_val_count, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), tfidf_vect_train_count,train_y_org, name='XGB_TF_IDF', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 10, 'eval_set': [(valid_seq, valid_y)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), train_seq,train_y, name='XGB_seq', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 10, 'eval_set': [(valid_seq_x, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), train_seq_x,train_y_org, name='XGB_Seq_Data', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(tfidf_vect_val_count, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), tfidf_vect_train_count_smote,train_y_org_smote, name='XGB_TF_IDF_SMOTE', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(xvalid_count_org, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), xtrain_count_org_smote,train_y_count_org_smote, name='XGB_Count_Vector_org_SMOTE', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(tfidf_vect_val_count, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), tfidf_vect_train_count_rovsamp,train_y_org_rovsamp, name='XGB_TF_IDF_rovsamp', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(xvalid_count_org, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), xtrain_count_org_rovsamp,train_y_count_org_rovsamp, name='XGB_count_vector_org_rovsamp', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(tfidf_vect_val_count, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), tfidf_vect_train_count_rundrsamp,train_y_org_rundrsamp, name='XGB_TF_IDF_rundrsamp', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
fit_params = {'early_stopping_rounds': 5, 'eval_set': [(xvalid_count_org, valid_y_org)], 'eval_metric': 'mlogloss'}
df_results = df_results.append(report(XGBClassifier(n_estimators=100, subsample=0.8), xtrain_count_org_rundrsamp,train_y_count_org_rundrsamp, name='XGB_count_vector_org_rundrsamp', cv=3, fit_params=fit_params, dict_scoring=score_metrics))
[XGBoost printed the same warning once per fit: "WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior."]
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), train_seq_x, train_y_org, name='RF_Seq_Data', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), tfidf_vect_train_count_smote,train_y_org_smote, name='RF_TF-IDF_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), xtrain_count_org_smote,train_y_count_org_smote, name='RF_count_vector_org_SMOTE', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), tfidf_vect_train_count_rovsamp,train_y_org_rovsamp, name='RF_TF-IDF_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), xtrain_count_org_rovsamp,train_y_count_org_rovsamp, name='RF_count_vector_org_rovsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), tfidf_vect_train_count_rundrsamp,train_y_org_rundrsamp, name='RF_TF-IDF_rundrsamp', cv=5, dict_scoring=score_metrics))
df_results = df_results.append(report(RandomForestClassifier(n_estimators = 500,random_state=40, n_jobs = -1,oob_score=True), xtrain_count_org_rundrsamp,train_y_count_org_rundrsamp, name='RF_count_vector_org_rndrsamp', cv=5, dict_scoring=score_metrics))
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_x_org)
encodings_train = tokenizer.texts_to_sequences(train_x_org)
encodings_test = tokenizer.texts_to_sequences(test_x_org)
encodings_val = tokenizer.texts_to_sequences(valid_x_org)
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
EMBEDDING_FILE = 'C:\\Users\\Create\\Downloads\\glove.6B.300d.txt'
embeddings = {}
for o in open(EMBEDDING_FILE, encoding='utf8'):
word = o.split(' ')[0]
embd = o.split(' ')[1:]
embd = np.asarray(embd, dtype='float32')
embeddings[word] = embd
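As a quick sanity check on the loaded vectors (the word pair is arbitrary and the exact value depends on the GloVe file):
def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
if 'password' in embeddings and 'login' in embeddings:
    print(cosine_sim(embeddings['password'], embeddings['login']))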
%%time
# create token-embedding mapping from the GloVe vectors loaded above
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='auto', patience=3)
check_p = tf.keras.callbacks.ModelCheckpoint("save_models/model.h5", save_best_only=True)
# Alternative: a pretrained fastText model could supply the vectors instead of GloVe,
# e.g. pretrained = fasttext.load_model('Fasttext.h5') (requires `import fasttext`).
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
def cross_validate_NN(model, X, y, X_test, y_test, callbacks,name="NN", fit_params=None, scoring=None, n_splits=5):
#print(model.__class__.__name__)
# ---- Parameters initialisation
seed = 42
k = 1
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
# Creation of list for each metric
if scoring is None:
    scoring = {}
dic_score = scoring.copy()
dic_score["fit_time"] = None
dic_score["score_time"] = None
scorer = {}
for i in dic_score.keys():
scorer[i] = []
index = ["Model"]
results = [name]
# ---- Loop on k-fold for cross-validation
for train, test in kfold.split(X, y):
fit_start = time.time()
_model = model  # note: the same model object is reused, so weights carry over between folds
_model.fit(X[train], y[train],
epochs=10, callbacks=[callbacks],
validation_split=0.2, verbose=False)
fit_end = time.time() - fit_start
_acc = _model.evaluate(X_test, y_test, verbose=0)
score_start = time.time()
y_pred = np.argmax(model.predict(X_test),axis=1)
score_end = time.time() - score_start
# ---- save each metric
for i in dic_score.keys():
if i == "fit_time":
scorer[i].append(fit_end)
index.append(i+'_cv'+str(k))
results.append(fit_end)
continue
if i == "score_time":
scorer[i].append(score_end)
index.append(i+'_cv'+str(k))
results.append(score_end)
continue
if i == "accuracy":
#
scorer[i].append(dic_score[i](y_test, y_pred))
index.append("test_"+i+'_cv'+str(k))
results.append(scorer[i][-1])
continue
scorer[i].append(dic_score[i](y_test, y_pred,average='weighted'))
index.append("test_"+i+'_cv'+str(k))
results.append(scorer[i][-1])
k+=1
# Compute mean and std for each metric
for i in scorer:
results.append(np.mean(scorer[i]))
results.append(np.std(scorer[i]))
if i == "fit_time":
index.append(i+"_mean")
index.append(i+"_std")
continue
if i == "score_time":
index.append(i+"_mean")
index.append(i+"_std")
continue
index.append("test_"+i+"_mean")
index.append("test_"+i+"_std")
# pickle.dump(model, open("_model", 'wb'))
model.save(name+'model.h5')
return pd.DataFrame(results, index=index).T
from tensorflow import keras
labels = 20
train_y_org.unique()
array([ 0, 10, 8, 4, 6, 16, 5, 29, 13, 19, 33, 9, 2, 24, 12, 18, 25,
17, 14, 3])
len(word_index)
2589
def create_rnn_model(word_index):
    embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    model = keras.Sequential([
        embedded,
        keras.layers.SimpleRNN(100, return_sequences=True),
        keras.layers.SimpleRNN(100),
        keras.layers.Dense(20, activation='sigmoid'),
        keras.layers.Dropout(0.2),
        # 34 output units = max group id (33) + 1; targets are raw group ids
        keras.layers.Dense(34, activation='softmax')])
    model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index), train_seq_x, train_y_org.values,valid_seq_x, valid_y_org.values, es, name="RNN_Seq",scoring=score_metrics, n_splits=3))
df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index), tfidf_vect_train_count, train_y_org.values, tfidf_vect_val_count, valid_y_org.values, es, name="RNN_TFIDF",scoring=score_metrics, n_splits=3))
def create_lstm_model(word_index):
    embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    model = keras.Sequential([
        embedded,
        keras.layers.LSTM(256),
        keras.layers.Dropout(0.25),
        # 34 output units = max group id (33) + 1; targets are raw group ids
        keras.layers.Dense(34, activation='softmax')])
    model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_lstm_model(word_index), tfidf_vect_train_count, train_y_org.values, tfidf_vect_val_count, valid_y_org.values, es, name="LSTM_TFIDF",scoring=score_metrics, n_splits=4))
Bidirectional LSTM
def create_bilstm_model(word_index):
    embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    model = keras.Sequential([
        embedded,
        #keras.layers.SpatialDropout1D(0.2),
        keras.layers.Bidirectional(keras.layers.LSTM(64, recurrent_dropout=0.1, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(64, recurrent_dropout=0.1)),
        keras.layers.Dense(32),
        keras.layers.Dropout(0.2),
        # 34 output units = max group id (33) + 1; targets are raw group ids
        keras.layers.Dense(34, activation='softmax')])
    model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_bilstm_model(word_index), tfidf_vect_train_count, train_y_org.values, tfidf_vect_val_count, valid_y_org.values, es, name="Bi-LSTM_TFIDF",scoring=score_metrics, n_splits=4))
def create_cnn_gru_model(word_index):
    embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    model = keras.Sequential([
        embedded,
        keras.layers.Conv1D(128, 5, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.MaxPooling1D(pool_size=4),
        keras.layers.GRU(32),
        # 34 output units = max group id (33) + 1; targets are raw group ids
        keras.layers.Dense(34, activation='softmax')])
    model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_cnn_gru_model(word_index),tfidf_vect_train_count, train_y_org.values, tfidf_vect_val_count, valid_y_org.values, es, name="CNN_GRU_WE", scoring=score_metrics, n_splits=2))
def create_gru_model(word_index):
    embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    model = keras.Sequential([
        embedded,
        keras.layers.GRU(32),
        keras.layers.Dropout(0.2),
        # 34 output units = max group id (33) + 1; targets are raw group ids
        keras.layers.Dense(34, activation='softmax')])
    model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])
    return model
df_results = df_results.append(cross_validate_NN(create_gru_model(word_index), train_seq_x, train_y_org.values, valid_seq_x, valid_y_org.values, es, name="GRU_WE", scoring=score_metrics, n_splits=2))
df_results[[ "Model","test_accuracy_mean","train_accuracy_mean","test_f1-score_mean", "test_f1-score_std","test_recall_score_mean","test_precision_score_mean" ]][df_results["test_f1-score_mean"]<1].sort_values(by=["test_f1-score_mean"], ascending=False).head(5)
| | Model | test_accuracy_mean | train_accuracy_mean | test_f1-score_mean | test_f1-score_std | test_recall_score_mean | test_precision_score_mean |
|---|---|---|---|---|---|---|---|
| 0 | RF_TF-IDF_SMOTE | 0.763111 | 0.801005 | 0.770153 | 0.008093 | 0.763111 | 0.877786 |
| 0 | RF_count_vector_org_SMOTE | 0.7246 | 0.746619 | 0.739739 | 0.005031 | 0.7246 | 0.864671 |
| 0 | RF_count_vector_org_rovsamp | 0.730987 | 0.916279 | 0.666814 | 0.003757 | 0.730987 | 0.668398 |
| 0 | LR_count_vector_org_SMOTE | 0.644655 | 0.654585 | 0.654912 | 0.006365 | 0.644655 | 0.779748 |
| 0 | LR_count_vector_org_rovsamp | 0.686782 | 0.822415 | 0.648763 | 0.005368 | 0.686782 | 0.644503 |
df_results[[ "Model","test_accuracy_mean","test_f1-score_mean", "test_f1-score_std","test_recall_score_mean","test_precision_score_mean" ]][df_results["test_f1-score_mean"]<1].sort_values(by=["test_f1-score_mean"], ascending=False).tail(5)
| | Model | test_accuracy_mean | test_f1-score_mean | test_f1-score_std | test_recall_score_mean | test_precision_score_mean |
|---|---|---|---|---|---|---|
| 0 | RNN_Seq | 0.469697 | 0.300219 | 0.0 | 0.469697 | 0.220615 |
| 0 | RNN_WE | 0.469697 | 0.300219 | 0.0 | 0.469697 | 0.220615 |
| 0 | NB_TF-IDF_rundrsamp | 0.365411 | 0.270152 | 0.009655 | 0.365411 | 0.300386 |
| 0 | NB_Count_Vectors_org_rundrsamp | 0.23902 | 0.222252 | 0.012875 | 0.23902 | 0.2996 |
| 0 | NB_Seq_Data | 0.079612 | 0.08365 | 0.01605 | 0.079612 | 0.396537 |
keras.models.load_model('GRU_WEmodel.h5').summary()
Model: "sequential_62" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_63 (Embedding) (None, None, 100) 259000 _________________________________________________________________ gru_2 (GRU) (None, 32) 12864 _________________________________________________________________ dropout_48 (Dropout) (None, 32) 0 _________________________________________________________________ dense_98 (Dense) (None, 1) 33 ================================================================= Total params: 271,897 Trainable params: 271,897 Non-trainable params: 0 _________________________________________________________________
keras.models.load_model('RNN_seqmodel.h5').summary()
Model: "sequential_25" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_26 (Embedding) (None, None, 100) 259000 _________________________________________________________________ simple_rnn_102 (SimpleRNN) (None, None, 100) 20100 _________________________________________________________________ simple_rnn_103 (SimpleRNN) (None, None, 100) 20100 _________________________________________________________________ dense_45 (Dense) (None, None, 20) 2020 _________________________________________________________________ dropout_14 (Dropout) (None, None, 20) 0 _________________________________________________________________ dense_46 (Dense) (None, None, 1) 21 ================================================================= Total params: 301,241 Trainable params: 301,241 Non-trainable params: 0 _________________________________________________________________
GRU, LSTM and RNN return much lower f1-scores and accuracy.
Hyperparameter tuning of the deep learning models could probably increase the accuracy further.
Grid search will be used for hyperparameter tuning to improve the score metrics.
The maximum accuracy we could achieve with the limited training sample and limited resources is 0.61, using Logistic Regression + Count Vectorizer.
We created an embedding layer with a weight matrix built from GloVe embeddings and compiled a GRU model, which resulted in an f1-score of 0.082558.
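A minimal sketch of the grid search step planned above, tuning Logistic Regression on the count-vector features (the parameter grid is illustrative, not the final tuned one):
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'saga']}
grid = GridSearchCV(linear_model.LogisticRegression(max_iter=1000), param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid.fit(xtrain_count_org, train_y_org)
print(grid.best_params_, grid.best_score_)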